library(plotly)
library(dplyr)
library(readxl)
library(tidyverse)
library(caret)
Movies_raw <- read_xlsx("Bechdel.xlsx", col_names = TRUE)
Movies_test <- read_csv("test.csv", col_names = TRUE) # Data to predict
Movies <- Movies_raw
names(Movies) <- c("year", "imdb", "budget", "domgross", "intgross", "code","budget_2013", "domgross_2013",
"intgross_2013", "Test")
#str(Movies)
Movies <- as.data.frame(Movies[-1, -c(11, 12)])
Movies_num <- Movies[, -2]
func <- function(x){
as.numeric(gsub("PASS", 1, gsub("FAIL", 0, x)))
}
Movies_num <- as.data.frame(lapply(Movies_num, FUN = func))
#Remove id column
Mov_code <- Movies[,2]
# Remove NA from data set
Movies_num <- Movies_num[complete.cases(Movies_num), ]
# Add provit variable (gross less budget) to see if it adds any insight
# assuming that budget = real cost to produce movies
Movies_num$domProfit <- Movies_num$domgross - Movies_num$budget
Movies_num$intProfit <- Movies_num$intgross - Movies_num$budget
Movies_num$domProfit_2013 <- Movies_num$domgross_2013 - Movies_num$budget_2013
Movies_num$intProfit_2013 <- Movies_num$intgross_2013 - Movies_num$budget_2013
# Reorder columns and remove NA
Movies_num <- Movies_num[ ,c(1, 2, 3, 4, 10, 11, 6, 7, 8, 12, 13, 9)]
Movies_clean <- Movies_num[complete.cases(Movies_num), ]
# Summarize data
summary(Movies_clean)
## year budget domgross
## Min. :1970 Min. : 7000 Min. : 828
## 1st Qu.:1998 1st Qu.: 12000000 1st Qu.: 16244304
## Median :2005 Median : 29000000 Median : 42704878
## Mean :2003 Mean : 45555379 Mean : 70440928
## 3rd Qu.:2009 3rd Qu.: 63000000 3rd Qu.: 95052518
## Max. :2013 Max. :425000000 Max. :760507625
## intgross domProfit intProfit
## Min. :8.280e+02 Min. :-201941321 Min. :-110450242
## 1st Qu.:2.583e+07 1st Qu.: -7210930 1st Qu.: 5235822
## Median :7.875e+07 Median : 8541081 Median : 43905789
## Mean :1.545e+08 Mean : 24885549 Mean : 108981403
## 3rd Qu.:1.986e+08 3rd Qu.: 42571574 3rd Qu.: 135132192
## Max. :2.784e+09 Max. : 458672302 Max. :2358918982
## budget_2013 domgross_2013 intgross_2013
## Min. : 8632 Min. :8.990e+02 Min. :8.990e+02
## 1st Qu.: 16143738 1st Qu.:2.022e+07 1st Qu.:3.278e+07
## Median : 36995786 Median :5.608e+07 Median :9.933e+07
## Mean : 56311299 Mean :9.719e+07 Mean :2.037e+08
## 3rd Qu.: 81684071 3rd Qu.:1.255e+08 3rd Qu.:2.467e+08
## Max. :461435929 Max. :1.772e+09 Max. :3.172e+09
## domProfit_2013 intProfit_2013 Test
## Min. :-204897453 Min. :-120328632 Min. :0.0000
## 1st Qu.: -8798028 1st Qu.: 6347922 1st Qu.:0.0000
## Median : 10831363 Median : 56550302 Median :0.0000
## Mean : 40879050 Mean : 147396397 Mean :0.4474
## 3rd Qu.: 55015145 3rd Qu.: 169980582 3rd Qu.:1.0000
## Max. :1729408181 Max. :3024171833 Max. :1.0000
# Look at how data id distributed by budget, gross sales, profit
hist_l = list()
hist_p <- function(df){
for(i in 1:10){
hist_l[i] <- plot_ly(x = df[ ,i], type = "histogram", name = names(df)[i])
}
return(hist_l)
}
hist_pl <- hist_p(Movies_clean)
unadjusted <- subplot(hist_pl[[2]], hist_pl[[3]], hist_pl[[4]], hist_pl[[5]], hist_pl[[1]], nrows = 4, margin = 0.04,
heights = c(0.25, 0.25, 0.25, 0.25))
adjusted <- subplot(hist_pl[[7]], hist_pl[[8]], hist_pl[[9]], nrows = 3, margin = 0.04, heights = c(0.33, 0.33, 0.33))
unadjusted
adjusted
It can be seen that the the variables roughly follow a Poisson distribution, the adjusted 2013 variables have higher means and it follows that the distributions are less skewed when compared to the unadjusted variables.
# Look at the distribution of the data
box_l <- list()
box_p <- function(df){
for(i in 1:11){
box_l[i] <- plot_ly(y = df[,i], type = "box", name = names(df)[i])
}
return(box_l)
}
str(Movies_clean)
## 'data.frame': 1484 obs. of 12 variables:
## $ year : num 1974 1982 2008 2011 2000 ...
## $ budget : num 1.30e+07 1.07e+07 1.50e+07 1.20e+07 2.60e+07 2.00e+07 3.00e+08 8.00e+07 2.00e+06 1.60e+07 ...
## $ domgross : num 57300000 74706019 2203641 16311571 45506619 ...
## $ intgross : num 5.73e+07 1.22e+08 6.38e+06 2.28e+07 7.58e+07 ...
## $ domProfit : num 44300000 64006019 -12796359 4311571 19506619 ...
## $ intProfit : num 44300000 111006019 -8620425 10750356 49763814 ...
## $ budget_2013 : num 61408439 25821968 16233845 12428289 35175618 ...
## $ domgross_2013 : num 2.71e+08 1.80e+08 2.38e+06 1.69e+07 6.16e+07 ...
## $ intgross_2013 : num 2.71e+08 2.94e+08 6.90e+06 2.36e+07 1.03e+08 ...
## $ domProfit_2013: num 209261066 154463677 -13848941 4465455 26390668 ...
## $ intProfit_2013: num 209261066 267887273 -9329510 11134045 67325881 ...
## $ Test : num 1 1 1 1 0 0 1 0 1 1 ...
box_pl <- box_p(Movies_clean)
subplot(box_pl[[2]], box_pl[[3]], box_pl[[4]], box_pl[[5]], box_pl[[6]])
subplot(box_pl[[7]], box_pl[[8]], box_pl[[9]], box_pl[[10]], box_pl[[11]])
# Percent of movies that passed test all time
percent_pass <- sum(Movies_num$Test)/length(Movies$Test) * 100
Movies_order <- Movies_clean[order(Movies_clean$year),]
Movies_order <- Movies_order[, c(1, 12)]
sum_yr <- aggregate(Movies_order$Test, by=list(Year = Movies_order$year), FUN = sum)
sum_yr$Pass <- sum_yr$x
library(plyr)
freq <- count(Movies_order, "year")
Change_percent <- cbind(sum_yr, `Percent pass Berchdel test` = (sum_yr$x/freq$freq)*100)
succes_time <- plot_ly(sum_yr, x = ~Year, y = ~Pass,
type = "scatter", mode = "markers+lines")
percent_time <- plot_ly(Change_percent, x = ~Year, y = ~`Percent pass Berchdel test`,
type = "scatter", mode = "markers+lines", color = "orange")
succes_time
percent_time
Bassed on the binary definition 44.2666667 \(\%\) of movies passed the Berchdel test. When the percentage pass is examined overtime it can be seen that there is a slight increase in movies that pass the Berchdel test since 1970. It is however notable that the quantity of movies produced also increased over the period.
Movies_clean$Test <- factor(Movies_clean$Test, levels = c(0, 1))
str(Movies_clean)
## 'data.frame': 1484 obs. of 12 variables:
## $ year : num 1974 1982 2008 2011 2000 ...
## $ budget : num 1.30e+07 1.07e+07 1.50e+07 1.20e+07 2.60e+07 2.00e+07 3.00e+08 8.00e+07 2.00e+06 1.60e+07 ...
## $ domgross : num 57300000 74706019 2203641 16311571 45506619 ...
## $ intgross : num 5.73e+07 1.22e+08 6.38e+06 2.28e+07 7.58e+07 ...
## $ domProfit : num 44300000 64006019 -12796359 4311571 19506619 ...
## $ intProfit : num 44300000 111006019 -8620425 10750356 49763814 ...
## $ budget_2013 : num 61408439 25821968 16233845 12428289 35175618 ...
## $ domgross_2013 : num 2.71e+08 1.80e+08 2.38e+06 1.69e+07 6.16e+07 ...
## $ intgross_2013 : num 2.71e+08 2.94e+08 6.90e+06 2.36e+07 1.03e+08 ...
## $ domProfit_2013: num 209261066 154463677 -13848941 4465455 26390668 ...
## $ intProfit_2013: num 209261066 267887273 -9329510 11134045 67325881 ...
## $ Test : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 2 2 ...
table(Movies_clean$Test)
##
## 0 1
## 820 664
# Prep Training and Test data.
set.seed(50)
split_Movies <- createDataPartition(Movies_clean$Test, p=0.85, list = F)
train <- Movies_clean[split_Movies, ]
test_split <- Movies_clean[-split_Movies, ]
table(train$Test)
##
## 0 1
## 697 565
set.seed(50)
scale_train_down <- downSample(x = train[, 2:11], y = train$Test, list = FALSE, yname = "Test")
table(scale_train_down$Test)
##
## 0 1
## 565 565
set.seed(50)
scale_train_up <- upSample(x = train[, 2:11], y = train$Test, list = FALSE, yname = "Test")
table(scale_train_up$Test)
##
## 0 1
## 697 697
# Examine different models and choose appropiate based on p-value
Movies_glm <- glm(Test ~ budget + domgross + intgross + domProfit + intProfit
+ budget_2013 + domgross_2013 + intgross_2013 + domProfit_2013
+ intProfit_2013, data = scale_train_up, family = binomial)
Movies_glm_2 <- glm(Test ~ budget + domgross + intgross
+ budget_2013 + domgross_2013 + intgross_2013,
data = scale_train_up, family = binomial)
Movies_glm_1 <- glm(Test ~ budget + domgross + budget_2013 + domgross_2013,
data = scale_train_up, family = binomial)
summary(Movies_glm)
##
## Call:
## glm(formula = Test ~ budget + domgross + intgross + domProfit +
## intProfit + budget_2013 + domgross_2013 + intgross_2013 +
## domProfit_2013 + intProfit_2013, family = binomial, data = scale_train_up)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5082 -1.1906 0.4406 1.0991 2.0967
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.725e-01 8.514e-02 4.375 1.21e-05 ***
## budget 5.084e-09 7.227e-09 0.703 0.481793
## domgross 2.047e-08 6.155e-09 3.325 0.000884 ***
## intgross -8.838e-09 3.176e-09 -2.783 0.005389 **
## domProfit NA NA NA NA
## intProfit NA NA NA NA
## budget_2013 -1.111e-08 6.057e-09 -1.834 0.066653 .
## domgross_2013 -1.693e-08 4.565e-09 -3.709 0.000208 ***
## intgross_2013 7.685e-09 2.439e-09 3.151 0.001625 **
## domProfit_2013 NA NA NA NA
## intProfit_2013 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1932.5 on 1393 degrees of freedom
## Residual deviance: 1877.2 on 1387 degrees of freedom
## AIC: 1891.2
##
## Number of Fisher Scoring iterations: 4
summary(Movies_glm_1)
##
## Call:
## glm(formula = Test ~ budget + domgross + budget_2013 + domgross_2013,
## family = binomial, data = scale_train_up)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4208 -1.1895 0.3918 1.1129 2.5284
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.244e-01 8.011e-02 4.049 5.14e-05 ***
## budget -4.076e-09 6.595e-09 -0.618 0.53654
## domgross 6.396e-09 2.180e-09 2.934 0.00334 **
## budget_2013 -3.472e-09 5.618e-09 -0.618 0.53649
## domgross_2013 -4.209e-09 1.421e-09 -2.963 0.00305 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1932.5 on 1393 degrees of freedom
## Residual deviance: 1889.5 on 1389 degrees of freedom
## AIC: 1899.5
##
## Number of Fisher Scoring iterations: 4
summary(Movies_glm_2) #Best model so far
##
## Call:
## glm(formula = Test ~ budget + domgross + intgross + budget_2013 +
## domgross_2013 + intgross_2013, family = binomial, data = scale_train_up)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5082 -1.1906 0.4406 1.0991 2.0967
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.725e-01 8.514e-02 4.375 1.21e-05 ***
## budget 5.084e-09 7.227e-09 0.703 0.481793
## domgross 2.047e-08 6.155e-09 3.325 0.000884 ***
## intgross -8.838e-09 3.176e-09 -2.783 0.005389 **
## budget_2013 -1.111e-08 6.057e-09 -1.834 0.066653 .
## domgross_2013 -1.693e-08 4.565e-09 -3.709 0.000208 ***
## intgross_2013 7.685e-09 2.439e-09 3.151 0.001625 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1932.5 on 1393 degrees of freedom
## Residual deviance: 1877.2 on 1387 degrees of freedom
## AIC: 1891.2
##
## Number of Fisher Scoring iterations: 4
# Prediction using model
pred <- predict(Movies_glm_2, newdata = test_split, type = "response")
pred_test <- ifelse(pred > 0.5, 1, 0)
Test_pred <- factor(pred_test, levels=c(0, 1))
Test_act <- factor(test_split$Test)
Pred_pro <- mean(Test_pred == Test_act)
# Predict test.csv movies data set using model
pred_N <- predict(Movies_glm_2, newdata = Movies_test, type = "response")
pred_test_csv <- ifelse(pred_N > 0.5, 1, 0)
Test_pred_csv <- factor(pred_test_csv, levels=c(0, 1))
Movies_New <- as.data.frame(cbind(Movies_test, Test_pred_csv))
func <- function(x){
gsub(1, "PASS", gsub(0, "FAIL", x))
}
Movies_Pred <- as.data.frame(lapply(Movies_New$Test_pred_csv, FUN = func))
head(Movies_New)
## year imdb budget domgross intgross code budget_2013
## 1 2013 tt1559547 5.00e+07 19452138 55940671 2013PASS 5.00e+07
## 2 2013 tt0765446 4.00e+07 57012977 66454811 2013PASS 4.00e+07
## 3 2013 tt1211956 7.00e+07 25213103 103813103 2013FAIL 7.00e+07
## 4 2013 tt1351685 1.95e+08 65187603 197387603 2013FAIL 1.95e+08
## 5 2013 tt1650554 2.80e+07 28795985 60839197 2013PASS 2.80e+07
## 6 2013 tt2002718 2.00e+06 8008161 9408161 2013PASS 2.00e+06
## domgross_2013 intgross_2013 Test_pred_csv
## 1 19452138 55940671 1
## 2 57012977 66454811 1
## 3 25213103 103813103 0
## 4 65187603 197387603 0
## 5 28795985 60839197 1
## 6 8008161 9408161 1
# Save predicted movie dasta set
save(x = Movies_Pred, file = "Movies_Pred.csv")
After prediction the accuracy was calculated to be 0.5765766 which is not very accurate. Other methods may be used that would posibly give greater accuracy.